1 Required R libraries

if (!require("pacman")) install.packages("pacman")
## Indlæser krævet pakke: pacman
if (!require("aamisc")){
  pacman::p_load("qvalue", "rain", "limma", "devtools")
  url <- "https://cran.r-project.org/src/contrib/Archive/HarmonicRegression/HarmonicRegression_1.0.tar.gz"
  pkgFile <- "HarmonicRegression_1.0.tar.gz"
  download.file(url = url, destfile = pkgFile)
  install.packages(pkgs=pkgFile, type="source", repos=NULL)
  file.remove(pkgFile)
}
## Indlæser krævet pakke: aamisc
pacman::p_load(
  "edgeR", "readr", "readxl", "biomaRt", "magrittr", "tibble", "stringr", 
  "ggplot2", "data.table", "patchwork", "openxlsx", "aamisc", "limma", 
  "devtools", "dplyr", "RColorBrewer", "ggrepel", "ComplexUpset", "tidyr")

# Color scheme and shapes for figures
RA_colors <- c("#003049", "#0073af")
LA_colors <- c("#d62828", "#e77d7d")
RV_colors <- c("#ff7f00", "#ffb266")
LV_colors <- c("#fcbf49", "#fee2ad")

# Shape 16 (circle) for Control and Shape 17 (triangle) for AF
condition_shapes <- c("Control" = 16, "AF" = 17)
chamber_colors <- c("RA" = "#003049", "LA" = "#d62828", "RV" = "#ff7f00", "LV" = "#fcbf49")
chamber_fill_colors <- c("RA" = "#0073af", "LA" = "#e77d7d", "RV" = "#ffb266", "LV" = "#fee2ad")

2 Read data

2.1 Count matrix and metadata

# Load count data from a local TSV file and metadata through Excel
count_file <- "../../../data/count/gene-expression-all-reverse-stranded-countReadPairs.tsv"
count <- readr::read_delim(count_file)
## Rows: 38849 Columns: 54
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (5): Geneid, Chr, Start, End, Strand
## dbl (49): Length, Dorado_LA_post, Dorado_LVFW, Dorado_RA_post, Dorado_RVFW, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
meta_file <- "../../../data/metadata/meta.xlsx" 
meta <- readxl::read_excel(meta_file)

# Gene annotation
geneinfo_file <- "../../../data/gene_annotation/horse_gene_annotation.tsv.gz"
geneinfo <- fread(geneinfo_file)
colnames(geneinfo)
##  [1] "Gene stable ID"                             
##  [2] "Gene stable ID version"                     
##  [3] "Gene description"                           
##  [4] "Chromosome/scaffold name"                   
##  [5] "Gene start (bp)"                            
##  [6] "Gene end (bp)"                              
##  [7] "Strand"                                     
##  [8] "Gene name"                                  
##  [9] "NCBI gene (formerly Entrezgene) ID"         
## [10] "NCBI gene (formerly Entrezgene) description"
# Rename columns
setnames(geneinfo, new = c("ENSEMBL", "ENSEMBLv", "Description_detailed", 
                           "Chr", "Start", "End", "Strand", "GENENAME", "ENTREZID", "Description"))
# Remove unnecessary columns
geneinfo <- geneinfo[, c("ENSEMBLv", "Description_detailed") := NULL]

# Remove duplicate entries based on ENSEMBL ID
geneinfo <- geneinfo[!duplicated(ENSEMBL), ]

# Convert annotation data to data frame and set rownames to ENSEMBL IDs
annot <- merge(x = count[,c("Geneid", "Length")], 
               y = geneinfo, 
               by.x = "Geneid",
               by.y = "ENSEMBL", 
               all.x = TRUE, 
               all.y = FALSE)
setnames(annot, old = "Geneid", new = "ENSEMBL")
annot <- data.frame(annot)
rownames(annot) <- annot$ENSEMBL

fwrite(x = annot, 
       file = "../../../data/gene_annotation/horse_gene_annotation_filtered.tsv.gz", 
       sep = "\t")

# Clean metadata and ensure it matches the count data
meta <- meta %>% remove_rownames %>% column_to_rownames(var="Sample_ID")
head(meta)
##                         Horse Group Region Training status Condition
## Dorado_LA_post         Dorado    AF     LA         Unknown     LA_AF
## Dorado_RA_post         Dorado    AF     RA         Unknown     RA_AF
## San_Diego_LA_post   San Diego    AF     LA         Unknown     LA_AF
## San_Diego_RA_post   San Diego    AF     RA         Unknown     RA_AF
## Kevin_Cook_LA_post Kevin Cook    AF     LA         Unknown     LA_AF
## Kevin_Cook_RA_post Kevin Cook    AF     RA         Unknown     RA_AF
count <- count[, -c(2:6)]

# Set "Geneid" as rownames and remove from the data frame for easy manipulation
count <- count %>% remove_rownames %>% column_to_rownames(var="Geneid")
head(count)
##                    Dorado_LA_post Dorado_LVFW Dorado_RA_post Dorado_RVFW
## ENSECAG00000013298            109         267            195         246
## ENSECAG00000019402              0           0              0           0
## ENSECAG00000052423            573         844            943         904
## ENSECAG00000011978              2          13              7          10
## ENSECAG00000005166            143         233            206         273
## ENSECAG00000032916            383         442            623         397
##                    Im_A_Mets_Fan_LA_post Im_A_Mets_Fan_LVFW
## ENSECAG00000013298                   204                255
## ENSECAG00000019402                     0                  0
## ENSECAG00000052423                  1094               1080
## ENSECAG00000011978                    21                  6
## ENSECAG00000005166                   249                262
## ENSECAG00000032916                   439                530
##                    Im_a_Mets_Fan_RA_post Im_a_Mets_Fan_RVFW Jytte_LA_post
## ENSECAG00000013298                   259                195           241
## ENSECAG00000019402                     0                  0             0
## ENSECAG00000052423                   931                879          1139
## ENSECAG00000011978                    12                  4            11
## ENSECAG00000005166                   323                212           226
## ENSECAG00000032916                   794                438           604
##                    Jytte_LVFW Jytte_RA_post Jytte_RVFW Kevin_Cook_LA_post
## ENSECAG00000013298        309           320        339                296
## ENSECAG00000019402          0             0          0                  0
## ENSECAG00000052423       1051           897       1088               1608
## ENSECAG00000011978         11            13         15                 15
## ENSECAG00000005166        299           313        302                316
## ENSECAG00000032916        463           548        494                747
##                    Kevin_Cook_LVFW Kevin_Cook_RA_post Kevin_Cook_RVFW
## ENSECAG00000013298             316                284             323
## ENSECAG00000019402               0                  0               0
## ENSECAG00000052423            1341               1089            1060
## ENSECAG00000011978              11                 17               5
## ENSECAG00000005166             218                332             246
## ENSECAG00000032916             635                659             642
##                    MAP11_LA_post MAP11_LV_midwall MAP11_RApost MAP11_RV_endo
## ENSECAG00000013298           215              330          223           230
## ENSECAG00000019402             0                0            0             0
## ENSECAG00000052423          1015             1022          918           646
## ENSECAG00000011978            13               16            9            14
## ENSECAG00000005166           299              244          374           239
## ENSECAG00000032916           449              497          474           343
##                    MAP3_LA_post MAP3_LV_midwall MAP3_RApost MAP3_RV_endo
## ENSECAG00000013298          239             328         240          266
## ENSECAG00000019402            0               0           0            0
## ENSECAG00000052423          877            1180         876          784
## ENSECAG00000011978            6               9           9           12
## ENSECAG00000005166          300             287         350          258
## ENSECAG00000032916          419             514         469          313
##                    MAP4_LA_post MAP4_LV_midwall MAP4_RApost MAP4_RV_endo
## ENSECAG00000013298          189             310         283          254
## ENSECAG00000019402            0               0           0            0
## ENSECAG00000052423         1058            1026         925          910
## ENSECAG00000011978            9              17          21           11
## ENSECAG00000005166          323             306         414          225
## ENSECAG00000032916          460             577         569          395
##                    MAP6_LA_post MAP6_LV_midwall MAP6_RApost MAP6_RV_endo
## ENSECAG00000013298          316             319         238          279
## ENSECAG00000019402            0               0           0            0
## ENSECAG00000052423         1038             765         765          617
## ENSECAG00000011978           11              10          12           13
## ENSECAG00000005166          365             251         315          255
## ENSECAG00000032916          482             379         380          275
##                    MAP8_LA_post MAP8_RV_endo MAP8_RApost MAP8_LV_midwall
## ENSECAG00000013298          252          267         191             246
## ENSECAG00000019402            0            0           0               0
## ENSECAG00000052423         1486          717        1025             902
## ENSECAG00000011978           15           14          12              15
## ENSECAG00000005166          456          249         355             275
## ENSECAG00000032916          643          356         478             453
##                    MAP9_LA_post MAP9_LV_midwall MAP9_RApost MAP9_RV_endo
## ENSECAG00000013298          305             309         233          248
## ENSECAG00000019402            0               0           0            0
## ENSECAG00000052423         1517             993         876          821
## ENSECAG00000011978           34              28          22           21
## ENSECAG00000005166          449             306         334          252
## ENSECAG00000032916          722             556         491          423
##                    San_Diego_LA_post San_Diego_LVFW San_Diego_RA_post
## ENSECAG00000013298               281            295               319
## ENSECAG00000019402                 0              0                 0
## ENSECAG00000052423               931           1108              1098
## ENSECAG00000011978                 5             12                12
## ENSECAG00000005166               245            288               256
## ENSECAG00000032916               527            551               669
##                    San_Diego_RVFW Styled_LA_post Styled_LVFW Styled_RA_post
## ENSECAG00000013298            298            179         200            260
## ENSECAG00000019402              0              0           0              0
## ENSECAG00000052423           1002           1089        1151           1195
## ENSECAG00000011978              6              8          10              5
## ENSECAG00000005166            280            222         185            312
## ENSECAG00000032916            409            673         497            641
##                    Styled_RVFW
## ENSECAG00000013298         335
## ENSECAG00000019402           0
## ENSECAG00000052423        1058
## ENSECAG00000011978          18
## ENSECAG00000005166         314
## ENSECAG00000032916         463

3 Differential expression analysis for multilevel designs

3.1 Read count matrix

# Remove version numbers from gene names if present (e.g., "Gene.1" -> "Gene").
rownames(count) <- stringr::str_split_fixed(string = rownames(count), 
                                        pattern = '[.]',
                                        n = 2)[,1]

# Reorder metadata and annotation tables to match the column order of the count matrix.
column_order <- names(count)
meta_reordered <- meta[column_order, , drop = FALSE]

annot_order <- rownames(count)
annot_reordered <- annot[annot_order, ]

# Create a DGEList object for `edgeR` analysis.
# The DGEList object contains the count matrix, gene annotation, and sample metadata.
d <- DGEList(counts = count, genes = annot_reordered, samples = meta_reordered)

3.2 Filtering

# Define the design matrix based on the experimental conditions
# This design matrix will help retain genes that are expressed in at least one condition
design <- model.matrix(~0 + Condition , d$samples)
colnames(design) <- gsub("Condition", "", colnames(design))
design
##                       LA_AF LA_Control LV_AF LV_Control RA_AF RA_Control RV_AF
## Dorado_LA_post            1          0     0          0     0          0     0
## Dorado_LVFW               0          0     1          0     0          0     0
## Dorado_RA_post            0          0     0          0     1          0     0
## Dorado_RVFW               0          0     0          0     0          0     1
## Im_A_Mets_Fan_LA_post     1          0     0          0     0          0     0
## Im_A_Mets_Fan_LVFW        0          0     1          0     0          0     0
## Im_a_Mets_Fan_RA_post     0          0     0          0     1          0     0
## Im_a_Mets_Fan_RVFW        0          0     0          0     0          0     1
## Jytte_LA_post             1          0     0          0     0          0     0
## Jytte_LVFW                0          0     1          0     0          0     0
## Jytte_RA_post             0          0     0          0     1          0     0
## Jytte_RVFW                0          0     0          0     0          0     1
## Kevin_Cook_LA_post        1          0     0          0     0          0     0
## Kevin_Cook_LVFW           0          0     1          0     0          0     0
## Kevin_Cook_RA_post        0          0     0          0     1          0     0
## Kevin_Cook_RVFW           0          0     0          0     0          0     1
## MAP11_LA_post             0          1     0          0     0          0     0
## MAP11_LV_midwall          0          0     0          1     0          0     0
## MAP11_RApost              0          0     0          0     0          1     0
## MAP11_RV_endo             0          0     0          0     0          0     0
## MAP3_LA_post              0          1     0          0     0          0     0
## MAP3_LV_midwall           0          0     0          1     0          0     0
## MAP3_RApost               0          0     0          0     0          1     0
## MAP3_RV_endo              0          0     0          0     0          0     0
## MAP4_LA_post              0          1     0          0     0          0     0
## MAP4_LV_midwall           0          0     0          1     0          0     0
## MAP4_RApost               0          0     0          0     0          1     0
## MAP4_RV_endo              0          0     0          0     0          0     0
## MAP6_LA_post              0          1     0          0     0          0     0
## MAP6_LV_midwall           0          0     0          1     0          0     0
## MAP6_RApost               0          0     0          0     0          1     0
## MAP6_RV_endo              0          0     0          0     0          0     0
## MAP8_LA_post              0          1     0          0     0          0     0
## MAP8_RV_endo              0          0     0          0     0          0     0
## MAP8_RApost               0          0     0          0     0          1     0
## MAP8_LV_midwall           0          0     0          1     0          0     0
## MAP9_LA_post              0          1     0          0     0          0     0
## MAP9_LV_midwall           0          0     0          1     0          0     0
## MAP9_RApost               0          0     0          0     0          1     0
## MAP9_RV_endo              0          0     0          0     0          0     0
## San_Diego_LA_post         1          0     0          0     0          0     0
## San_Diego_LVFW            0          0     1          0     0          0     0
## San_Diego_RA_post         0          0     0          0     1          0     0
## San_Diego_RVFW            0          0     0          0     0          0     1
## Styled_LA_post            1          0     0          0     0          0     0
## Styled_LVFW               0          0     1          0     0          0     0
## Styled_RA_post            0          0     0          0     1          0     0
## Styled_RVFW               0          0     0          0     0          0     1
##                       RV_Control
## Dorado_LA_post                 0
## Dorado_LVFW                    0
## Dorado_RA_post                 0
## Dorado_RVFW                    0
## Im_A_Mets_Fan_LA_post          0
## Im_A_Mets_Fan_LVFW             0
## Im_a_Mets_Fan_RA_post          0
## Im_a_Mets_Fan_RVFW             0
## Jytte_LA_post                  0
## Jytte_LVFW                     0
## Jytte_RA_post                  0
## Jytte_RVFW                     0
## Kevin_Cook_LA_post             0
## Kevin_Cook_LVFW                0
## Kevin_Cook_RA_post             0
## Kevin_Cook_RVFW                0
## MAP11_LA_post                  0
## MAP11_LV_midwall               0
## MAP11_RApost                   0
## MAP11_RV_endo                  1
## MAP3_LA_post                   0
## MAP3_LV_midwall                0
## MAP3_RApost                    0
## MAP3_RV_endo                   1
## MAP4_LA_post                   0
## MAP4_LV_midwall                0
## MAP4_RApost                    0
## MAP4_RV_endo                   1
## MAP6_LA_post                   0
## MAP6_LV_midwall                0
## MAP6_RApost                    0
## MAP6_RV_endo                   1
## MAP8_LA_post                   0
## MAP8_RV_endo                   1
## MAP8_RApost                    0
## MAP8_LV_midwall                0
## MAP9_LA_post                   0
## MAP9_LV_midwall                0
## MAP9_RApost                    0
## MAP9_RV_endo                   1
## San_Diego_LA_post              0
## San_Diego_LVFW                 0
## San_Diego_RA_post              0
## San_Diego_RVFW                 0
## Styled_LA_post                 0
## Styled_LVFW                    0
## Styled_RA_post                 0
## Styled_RVFW                    0
## attr(,"assign")
## [1] 1 1 1 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Condition
## [1] "contr.treatment"
keep <- filterByExpr(d, design = design)
table(keep)
## keep
## FALSE  TRUE 
## 24339 14510
# Create a new DGEList object with only the retained genes
y <- d[keep, ,keep.lib.sizes=FALSE]

# Summary of the filtered DGEList object
cat("Number of genes & samples:", dim(y)[1], "&", dim(y)[2], "\n")
## Number of genes & samples: 14510 & 48

3.3 Normalization

# calculate normalization factors (TMM normalization of library size using edgeR)
y <- calcNormFactors(y)

# create design matrix
design <- model.matrix(~0 + Condition , y$samples)
colnames(design) <- gsub("Condition", "", colnames(design))
design
##                       LA_AF LA_Control LV_AF LV_Control RA_AF RA_Control RV_AF
## Dorado_LA_post            1          0     0          0     0          0     0
## Dorado_LVFW               0          0     1          0     0          0     0
## Dorado_RA_post            0          0     0          0     1          0     0
## Dorado_RVFW               0          0     0          0     0          0     1
## Im_A_Mets_Fan_LA_post     1          0     0          0     0          0     0
## Im_A_Mets_Fan_LVFW        0          0     1          0     0          0     0
## Im_a_Mets_Fan_RA_post     0          0     0          0     1          0     0
## Im_a_Mets_Fan_RVFW        0          0     0          0     0          0     1
## Jytte_LA_post             1          0     0          0     0          0     0
## Jytte_LVFW                0          0     1          0     0          0     0
## Jytte_RA_post             0          0     0          0     1          0     0
## Jytte_RVFW                0          0     0          0     0          0     1
## Kevin_Cook_LA_post        1          0     0          0     0          0     0
## Kevin_Cook_LVFW           0          0     1          0     0          0     0
## Kevin_Cook_RA_post        0          0     0          0     1          0     0
## Kevin_Cook_RVFW           0          0     0          0     0          0     1
## MAP11_LA_post             0          1     0          0     0          0     0
## MAP11_LV_midwall          0          0     0          1     0          0     0
## MAP11_RApost              0          0     0          0     0          1     0
## MAP11_RV_endo             0          0     0          0     0          0     0
## MAP3_LA_post              0          1     0          0     0          0     0
## MAP3_LV_midwall           0          0     0          1     0          0     0
## MAP3_RApost               0          0     0          0     0          1     0
## MAP3_RV_endo              0          0     0          0     0          0     0
## MAP4_LA_post              0          1     0          0     0          0     0
## MAP4_LV_midwall           0          0     0          1     0          0     0
## MAP4_RApost               0          0     0          0     0          1     0
## MAP4_RV_endo              0          0     0          0     0          0     0
## MAP6_LA_post              0          1     0          0     0          0     0
## MAP6_LV_midwall           0          0     0          1     0          0     0
## MAP6_RApost               0          0     0          0     0          1     0
## MAP6_RV_endo              0          0     0          0     0          0     0
## MAP8_LA_post              0          1     0          0     0          0     0
## MAP8_RV_endo              0          0     0          0     0          0     0
## MAP8_RApost               0          0     0          0     0          1     0
## MAP8_LV_midwall           0          0     0          1     0          0     0
## MAP9_LA_post              0          1     0          0     0          0     0
## MAP9_LV_midwall           0          0     0          1     0          0     0
## MAP9_RApost               0          0     0          0     0          1     0
## MAP9_RV_endo              0          0     0          0     0          0     0
## San_Diego_LA_post         1          0     0          0     0          0     0
## San_Diego_LVFW            0          0     1          0     0          0     0
## San_Diego_RA_post         0          0     0          0     1          0     0
## San_Diego_RVFW            0          0     0          0     0          0     1
## Styled_LA_post            1          0     0          0     0          0     0
## Styled_LVFW               0          0     1          0     0          0     0
## Styled_RA_post            0          0     0          0     1          0     0
## Styled_RVFW               0          0     0          0     0          0     1
##                       RV_Control
## Dorado_LA_post                 0
## Dorado_LVFW                    0
## Dorado_RA_post                 0
## Dorado_RVFW                    0
## Im_A_Mets_Fan_LA_post          0
## Im_A_Mets_Fan_LVFW             0
## Im_a_Mets_Fan_RA_post          0
## Im_a_Mets_Fan_RVFW             0
## Jytte_LA_post                  0
## Jytte_LVFW                     0
## Jytte_RA_post                  0
## Jytte_RVFW                     0
## Kevin_Cook_LA_post             0
## Kevin_Cook_LVFW                0
## Kevin_Cook_RA_post             0
## Kevin_Cook_RVFW                0
## MAP11_LA_post                  0
## MAP11_LV_midwall               0
## MAP11_RApost                   0
## MAP11_RV_endo                  1
## MAP3_LA_post                   0
## MAP3_LV_midwall                0
## MAP3_RApost                    0
## MAP3_RV_endo                   1
## MAP4_LA_post                   0
## MAP4_LV_midwall                0
## MAP4_RApost                    0
## MAP4_RV_endo                   1
## MAP6_LA_post                   0
## MAP6_LV_midwall                0
## MAP6_RApost                    0
## MAP6_RV_endo                   1
## MAP8_LA_post                   0
## MAP8_RV_endo                   1
## MAP8_RApost                    0
## MAP8_LV_midwall                0
## MAP9_LA_post                   0
## MAP9_LV_midwall                0
## MAP9_RApost                    0
## MAP9_RV_endo                   1
## San_Diego_LA_post              0
## San_Diego_LVFW                 0
## San_Diego_RA_post              0
## San_Diego_RVFW                 0
## Styled_LA_post                 0
## Styled_LVFW                    0
## Styled_RA_post                 0
## Styled_RVFW                    0
## attr(,"assign")
## [1] 1 1 1 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Condition
## [1] "contr.treatment"
# estimate dispersions
y <- estimateDisp(y, design)

# normalized expression levels (log2-transformed version of TMM-normalized CPM-values)
CPM <- cpm(y)
logCPM <- cpm(y, log = TRUE)

# Save the logCPM matrix to a CSV file
# write.csv(logCPM, "../output/Log2_TMM_normalized_CPM_values.csv", row.names = TRUE)

3.3.1 Count distributions

# Transform the logCPM matrix into long format for ggplot2 visualization.
# This will allow each gene/sample pair to be plotted individually.
logCPM_melted <- data.table::melt(logCPM)
## Warning in melt.default(logCPM): The melt generic in data.table has been passed
## a matrix and will attempt to redirect to the relevant reshape2 method; please
## note that reshape2 is superseded and is no longer actively developed, and this
## redirection is now deprecated. To continue using melt methods from reshape2
## while both libraries are attached, e.g. melt.list, you can prepend the
## namespace, i.e. reshape2::melt(logCPM). In the next version, this warning will
## become an error.
# Rename columns for better readability and consistency.
# "Var1" and "Var2" refer to the dimensions of the matrix and are renamed to "Gene" and "Sample".
setnames(x = logCPM_melted, 
         old = c("Var1", "Var2", "value"),
         new = c("Gene", "Sample", "logCPM"))

# Add experimental conditions like "Group" or "Region" to each sample for coloring in the plots.
logCPM_melted <- data.table::merge.data.table(x = logCPM_melted, 
                                              y = data.table(Sample = rownames(meta), meta), 
                                              by = "Sample")

# Color the boxes based on the experimental condition to highlight potential differences.
ggplot(logCPM_melted, aes(x = Sample, y = logCPM)) + 
  geom_boxplot(aes(color = Condition)) +   
  theme_bw() +                             
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +  
  ggtitle("logCPM boxplots")             

# Create density plots of logCPM values for each sample.
# This visualization helps to assess the distribution of expression values and identify batch effects.
ggplot(logCPM_melted, aes(x = logCPM)) + 
  geom_density(aes(group = Sample, color = Condition)) +  
  theme_bw() +                                           
  ggtitle("logCPM density distributions")                

3.4 Dimension reduction

3.4.1 PCA Plot All horses (un-corrected)

# Define custom colors and shapes for PCA plots
condition_shapes <- c("Control" = 16, "AF" = 17)  # Circle for Control, Triangle for AF

# Perform PCA on the logCPM values
pca <- prcomp(t(logCPM))  # Transpose logCPM so that samples are in rows and genes are in columns

# Extract PCA scores
pca_scores <- as.data.frame(pca$x)  # PCA scores
pca_scores$Sample <- rownames(pca_scores)  # Add sample names

# Merge with metadata for plotting
pca_scores <- merge(pca_scores, meta, by.x = "Sample", by.y = "row.names")

# Define dimensions for plotting
dims <- list(p1 = c("PC1", "PC2"), p2 = c("PC1", "PC3"), p3 = c("PC1", "PC4"))
pca_plot <- list()

# Without labels
for (i in seq_along(dims)){
  pca_plot[[i]] <- ggplot(pca_scores, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group")) +
    geom_point(size = 4, alpha = 0.8) +
    scale_color_manual(values = chamber_colors) +
    scale_shape_manual(values = condition_shapes) +
    theme_minimal() +
    labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
    theme(legend.position = "right")
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Combine plots using patchwork
patchwork::wrap_plots(pca_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# With labels
for (i in seq_along(dims)){
  pca_plot[[i]] <- ggplot(pca_scores, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group", label = "Horse")) +
    geom_point(size = 4, alpha = 0.8) +
    geom_text(vjust = -0.5, size = 3) +  # Add labels for each sample
    scale_color_manual(values = chamber_colors) +
    scale_shape_manual(values = condition_shapes) +
    theme_minimal() +
    labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
    theme(legend.position = "right")
}

# Combine labeled plots using patchwork
patchwork::wrap_plots(pca_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')

### Extract top loadings
# Define a function to extract the top contributing genes for each component
get_top_loadings <- function(pca_obj, pc_num, top_n = 10) {
  # Extract loadings for the specified component
  loadings <- pca_obj$rotation[, pc_num]
  
  # Get the top genes with highest absolute loadings
  top_genes <- sort(abs(loadings), decreasing = TRUE)[1:top_n]
  
  # Extract gene names and actual loading values for the top genes
  top_loadings <- data.frame(
    Gene = names(top_genes),         # Gene names
    Loading = loadings[names(top_genes)],  # Loading values
    PC = paste0("PC", pc_num)        # Principal Component identifier
  )
  
  return(top_loadings)
}
### Verdict: Batch effect relating to each individual horse contributing four samples

3.4.2 PCA Plot All samples - Batch effect removed - Figure 2

# Remove batch effect using `removeBatchEffect` before PCA
logCPM_batchRemoved <- removeBatchEffect(logCPM, batch = y$samples$Horse, design = design)
## Coefficients not estimable: batch11
## Warning: Partial NA coefficients for 14510 probe(s)
# Perform PCA on batch-corrected logCPM values
pca_batchRemoved <- prcomp(t(logCPM_batchRemoved))

# Extract PCA scores and add sample information
pca_scores <- as.data.frame(pca_batchRemoved$x)  
pca_scores$Sample <- rownames(pca_scores)  

# Merge PCA scores with metadata
pca_scores <- merge(pca_scores, meta, by.x = "Sample", by.y = "row.names")

# Define dimensions for PCA plotting
dims <- list(p1 = c("PC1", "PC2"), p2 = c("PC2", "PC3"), p3 = c("PC1", "PC4"))
pca_plot <- list()

# Create PCA plots without labels
for (i in seq_along(dims)){
  pca_plot[[i]] <- ggplot(pca_scores, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group")) +
    geom_point(size = 4, alpha = 0.8) +
    scale_color_manual(values = chamber_colors) +
    scale_shape_manual(values = condition_shapes) +
    theme_minimal() +
    labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
    theme(legend.position = "right")
}

# Combine PCA plots using patchwork
patchwork::wrap_plots(pca_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# Create PCA plots with labels for each sample
for (i in seq_along(dims)){
  pca_plot[[i]] <- ggplot(pca_scores, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group", label = "Horse")) +
    geom_point(size = 4, alpha = 0.8) +
    geom_text(vjust = -0.5, size = 3) +  # Add labels for each sample
    scale_color_manual(values = chamber_colors) +
    scale_shape_manual(values = condition_shapes) +
    theme_minimal() +
    labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
    theme(legend.position = "right")
}

# Combine labeled PCA plots using patchwork
patchwork::wrap_plots(pca_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')

# Find top loadings
## Function to get all loadings for specified principal components and save to one file
extract_all_loadings <- function(pca_obj, pcs = 1:6, annot) {
  # Extract all loadings for the specified principal components
  all_loadings <- pca_obj$rotation[, pcs, drop = FALSE]
  
  # Convert to a data frame for easy manipulation
  loadings_df <- as.data.frame(all_loadings)
  
  # Add ENSEMBL IDs as a column
  loadings_df$ENSEMBL <- rownames(loadings_df)
  
  # Merge with annotation to include gene names
  loadings_df <- merge(loadings_df, annot[, c("ENSEMBL", "GENENAME")], by = "ENSEMBL", all.x = TRUE)
  
  # Rearrange columns to have GENENAME and ENSEMBL first for clarity
  loadings_df <- loadings_df[, c("GENENAME", "ENSEMBL", colnames(loadings_df)[1:(ncol(loadings_df) - 2)])]
  
  return(loadings_df)
}

# Extract loadings for PC1 to PC6
all_loadings_df <- extract_all_loadings(pca_obj = pca_batchRemoved, pcs = 1:6, annot = annot_reordered)

# Save the combined loadings to a single CSV file
write.csv(all_loadings_df, file = "../output/All_Loadings_PC1_to_PC6.csv", row.names = FALSE)

# Preview the combined loadings
head(all_loadings_df)
##   GENENAME            ENSEMBL          ENSEMBL.1           PC1           PC2
## 1    MTPAP ENSECAG00000000001 ENSECAG00000000001  0.0021249993 -3.761005e-04
## 2   R3HDM4 ENSECAG00000000003 ENSECAG00000000003  0.0011267619 -4.816464e-03
## 3   RNF170 ENSECAG00000000004 ENSECAG00000000004  0.0007005666  4.874067e-03
## 4   TMEM70 ENSECAG00000000007 ENSECAG00000000007  0.0077844405 -2.652240e-06
## 5    RPLP2 ENSECAG00000000008 ENSECAG00000000008 -0.0023290581 -5.216934e-03
## 6   LRRC58 ENSECAG00000000010 ENSECAG00000000010 -0.0052616115  3.010118e-03
##             PC3           PC4          PC5
## 1 -5.083679e-03 -0.0003394471 -0.002158191
## 2  5.224774e-04  0.0014866072 -0.001906639
## 3 -2.923870e-06  0.0002625384 -0.000943564
## 4  4.748882e-03 -0.0030212221 -0.004118943
## 5  1.725059e-02 -0.0110220681 -0.002596778
## 6 -1.455616e-03  0.0045966671  0.007107533
# Source the helper functions
source("pca_helpers_all_samples.R")

# Function to extract loadings based on highlighted genes
extract_and_map_loadings <- function(pca_obj, gene_list, annot) {
  ensembl_ids <- annot$ENSEMBL[match(gene_list, annot$GENENAME)]
  ensembl_ids <- ensembl_ids[!is.na(ensembl_ids)]
  valid_ids <- intersect(ensembl_ids, rownames(pca_obj$rotation))

  if (length(valid_ids) == 0) {
    warning("No matching ENSEMBL IDs found in PCA loadings.")
    return(NULL)
  }

  loadings_df <- as.data.frame(pca_obj$rotation[valid_ids, , drop = FALSE])
  loadings_df$ENSEMBL <- rownames(loadings_df)
  loadings_df <- merge(loadings_df, annot[, c("ENSEMBL", "GENENAME")], by = "ENSEMBL", all.x = TRUE)
  
  return(loadings_df)
}

### FOR PC1 & PC2

# Highlighted genes for PCA loadings extraction
highlighted_genes <- c("SCD5", "PITX2", "KCNK3", "BMP10", "KCVN2", "IRX2", "NR4A3", "MYH7")
specific_loadings_df <- extract_and_map_loadings(pca_batchRemoved, highlighted_genes, annot_reordered)
pca_batchRemoved <- prcomp(t(logCPM_batchRemoved))
pca_scores <- as.data.frame(pca_batchRemoved$x)  # PCA scores
pca_scores$Sample <- rownames(pca_scores)  # Add sample names
pca_scores <- merge(pca_scores, meta, by.x = "Sample", by.y = "row.names")

# Generate PCA plots without and with labels
dimensions <- list(c("PC1", "PC2"))

# Generate and display PCA plots for all samples (both healthy and disease groups)
generate_and_save_pca_plots(
  pca_df = pca_scores,
  loadings_df = specific_loadings_df,
  dimensions = dimensions,
  colors = chamber_colors,
  shapes = condition_shapes,  # Include shapes for control/disease
  add_ellipse = FALSE,  # Optionally toggle to TRUE if ellipses are needed
  shape_var = "Group"  # Use Group as the shape variable
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

### FOR PC2 & PC3, PC1 & PC4

# Highlighted genes for PCA loadings extraction
highlighted_genes <- c("NPPA", "KCNJ5", "KCNK3", "NPPB", "MYL7", "KCNV2", "PITX2", "BMP10", 
                       "IRX4", "IRX5", "IRX3", "KCNJ2", "SCD5")
specific_loadings_df <- extract_and_map_loadings(pca_batchRemoved, highlighted_genes, annot_reordered)
print(specific_loadings_df)
##               ENSEMBL         PC1          PC2          PC3          PC4
## 1  ENSECAG00000007670 -0.05227401 -0.039228131  0.121752401  0.197294927
## 2  ENSECAG00000007922  0.05444175  0.006931374 -0.018064339 -0.016126856
## 3  ENSECAG00000008407 -0.06542148  0.016219977 -0.050015231 -0.065636026
## 4  ENSECAG00000014282 -0.07571580  0.008074537  0.085644536 -0.040248734
## 5  ENSECAG00000014698 -0.01426309  0.078253440 -0.021441142  0.035740370
## 6  ENSECAG00000014892 -0.13160409 -0.010678639  0.031471107 -0.030311520
## 7  ENSECAG00000016602 -0.07321082  0.006809435 -0.047207178 -0.100039607
## 8  ENSECAG00000017403  0.07276413  0.008245171  0.006803378 -0.003054911
## 9  ENSECAG00000018912 -0.07433800 -0.004520907 -0.009058442 -0.004476423
## 10 ENSECAG00000023496  0.06668556  0.011300322  0.031639029  0.035227351
## 11 ENSECAG00000023545  0.07361160  0.009361508  0.004299142  0.003657962
## 12 ENSECAG00000025007 -0.10905821 -0.009476101 -0.012414520 -0.002759771
## 13 ENSECAG00000038775 -0.07815285 -0.003381890 -0.024213114 -0.022184192
##             PC5          PC6           PC7           PC8          PC9
## 1  -0.034517979 -0.105035693  0.0335415778  2.339717e-03 -0.050400558
## 2  -0.024091972  0.002825550  0.0167563366 -1.699736e-02  0.009059961
## 3  -0.009609884  0.008048334 -0.0196856531 -2.058912e-02  0.027077607
## 4   0.046901363  0.119284403  0.0245399524 -3.206733e-03  0.037957597
## 5  -0.013571135 -0.017373316 -0.0067161956  2.853471e-02  0.002363195
## 6   0.052735794  0.034702499  0.0405253381 -2.819983e-02  0.017444449
## 7   0.021847510  0.037233077 -0.0003363777 -3.863412e-02 -0.035901694
## 8  -0.003377872  0.024508447  0.0034894687  2.130012e-03  0.012685469
## 9  -0.022211013  0.016516799  0.0115826854  1.070533e-02 -0.006952828
## 10 -0.006385543 -0.009435043  0.0281000028 -2.057182e-03  0.020087719
## 11  0.005779000  0.011568931  0.0089681269 -2.026065e-04  0.020113590
## 12  0.012384525 -0.031183442  0.0036119556  1.845239e-05  0.019176946
## 13 -0.014226770 -0.038084658  0.0230052610  1.230566e-02 -0.017696928
##             PC10          PC11          PC12          PC13          PC14
## 1  -0.0406682010 -0.0478721050 -0.0210383924 -0.0062902000 -0.0109909121
## 2  -0.0009258402  0.0032837035 -0.0034883619  0.0030770604 -0.0083290567
## 3   0.0222326443  0.0456264398 -0.0559828397  0.0208894431  0.0113645351
## 4  -0.0036575229 -0.0263514299  0.0331283604 -0.0114431434  0.0241195224
## 5  -0.0344942055 -0.0043542906  0.0002893075 -0.0097573266 -0.0069985513
## 6   0.0035872669 -0.0407175466 -0.0159527793 -0.0169374856 -0.0055278893
## 7  -0.0041344181  0.0075445749  0.0412638917 -0.0004439656 -0.0133117530
## 8   0.0045333676  0.0031444501  0.0009855603  0.0034574046 -0.0062771485
## 9  -0.0210436549  0.0005091515 -0.0100681846 -0.0131162841 -0.0306323267
## 10 -0.0051524854 -0.0229096684  0.0228149929  0.0002942752  0.0122788597
## 11  0.0139444146  0.0015228059  0.0190142937 -0.0003578966 -0.0004129403
## 12 -0.0213580555 -0.0270250041 -0.0492959181  0.0004178917  0.0192838555
## 13 -0.0076833884 -0.0057037608  0.0110135729 -0.0049312236 -0.0089598505
##             PC15         PC16         PC17          PC18          PC19
## 1   0.0844996231 -0.022799122 -0.052544404  1.526425e-02 -0.0481451338
## 2  -0.0232162905  0.006346367 -0.024314059  1.606317e-02 -0.0137897135
## 3   0.0033673776  0.001489856  0.055077502  1.176700e-02  0.0116164146
## 4   0.0765147147 -0.049093865 -0.092422580  1.638500e-02 -0.0870742840
## 5  -0.0296014639  0.017422842 -0.017792195 -3.174248e-02 -0.0047453487
## 6   0.0386653985 -0.007406411 -0.055815696 -4.169079e-02 -0.0156626034
## 7   0.0359420540 -0.025129301  0.044267877  5.754526e-02 -0.0437472351
## 8  -0.0173147721  0.008564201 -0.011969621  1.376638e-02 -0.0197647394
## 9   0.0104503209  0.045897760  0.046648701  7.647692e-03 -0.0206674502
## 10  0.0009862748 -0.010919098 -0.013678301  7.264628e-05 -0.0215725804
## 11 -0.0045916253  0.005491875 -0.007974125  1.403380e-03 -0.0076793106
## 12  0.0184696724 -0.002795138  0.012268096 -1.332122e-02 -0.0003808918
## 13  0.0113647277 -0.003376706  0.008903726 -1.095808e-03  0.0010683520
##            PC20          PC21         PC22         PC23          PC24
## 1   0.008567295  0.0302295414 -0.007939727 -0.014787966 -7.554412e-02
## 2   0.023617679  0.0132745607  0.029442843 -0.008286473  4.215635e-03
## 3   0.029170852  0.0306411189 -0.012015233  0.015105768  1.153044e-02
## 4   0.021252825  0.0366026327  0.029951521  0.046386657 -5.406510e-03
## 5   0.017830116 -0.0155500570 -0.013883506 -0.010305925 -1.236698e-03
## 6   0.016287730 -0.0055569385 -0.012478637  0.035907356 -3.230011e-02
## 7   0.075896695 -0.0078088451 -0.055552710 -0.006321799 -3.794545e-02
## 8  -0.006002097 -0.0064265657 -0.001602502 -0.002206307  4.516589e-05
## 9   0.020273897 -0.0007999865 -0.027350136  0.021036989 -6.709532e-03
## 10  0.007441009 -0.0003734871  0.004379471  0.003845154  1.153018e-02
## 11 -0.013762579 -0.0111086607  0.016008915  0.009729761  1.167720e-02
## 12  0.009580203  0.0297791220 -0.007373942 -0.016707403 -5.573077e-03
## 13 -0.016331578  0.0079938043 -0.005010974 -0.015417255 -5.654312e-03
##             PC25         PC26          PC27         PC28          PC29
## 1  -0.0205663975  0.014295363 -0.0087117156 -0.005819478  0.0180242568
## 2   0.0237773847  0.005400292  0.0093635200  0.002372151 -0.0019047291
## 3  -0.0145585464  0.009824951 -0.0035725748 -0.002156993  0.0279208497
## 4  -0.0201255986  0.053814428  0.0288566341  0.006196124  0.0002587613
## 5  -0.0126381608 -0.016300043  0.0180666819  0.003694298 -0.0163138720
## 6   0.0110771342  0.028460193  0.0260018302 -0.012432077 -0.0014149188
## 7  -0.0079037742  0.015396341  0.0007386441 -0.006959592  0.0087293711
## 8   0.0060093280  0.009904557  0.0085054539 -0.002073155 -0.0112026564
## 9  -0.0233336255 -0.009082222  0.0274029416  0.006530430 -0.0127174215
## 10 -0.0178076440  0.006702277  0.0091208121 -0.004271487 -0.0100109031
## 11  0.0005913259  0.002650115  0.0135145302  0.000128555 -0.0051144449
## 12 -0.0036590888 -0.037978342 -0.0144735318 -0.011264403 -0.0138719753
## 13  0.0044095005 -0.016957107 -0.0108380786 -0.002016082 -0.0051381850
##             PC30          PC31          PC32         PC33          PC34
## 1   2.647413e-02  0.0395826679  0.0078979940 -0.018056867 -0.0193066792
## 2  -5.068716e-03  0.0141736564 -0.0061971533  0.003554042 -0.0155269909
## 3   3.649226e-03  0.0061833439 -0.0269071426  0.023478244 -0.0002811612
## 4   2.981332e-03  0.0141465915  0.0060626054 -0.017997391  0.0196690499
## 5   3.287966e-03 -0.0033201920  0.0004104974 -0.006914804 -0.0018425302
## 6  -1.894172e-02 -0.0132958103  0.0102201596 -0.026508146 -0.0066956467
## 7  -5.129332e-04  0.0148340341 -0.0246611495  0.029082143 -0.0086514211
## 8  -3.898721e-03 -0.0009469725 -0.0016352811 -0.001373161 -0.0040737581
## 9   6.113522e-05 -0.0071506911  0.0153339779 -0.025660998  0.0252363176
## 10  1.970222e-03  0.0131020377  0.0121681664 -0.004870636 -0.0044985051
## 11  5.329238e-03  0.0049612414 -0.0069427304  0.007120660 -0.0015881873
## 12  1.744893e-02 -0.0040130015  0.0020424721  0.021831175  0.0113678949
## 13  1.262867e-02 -0.0015129147  0.0007094371  0.003424982  0.0162941417
##            PC35          PC36          PC37          PC38          PC39
## 1   0.002543571 -0.0117022055  0.0194024332 -0.0020501829  0.0073637191
## 2   0.002958825 -0.0077399371  0.0178359947  0.0065947068 -0.0010071189
## 3  -0.008724012  0.0045279034  0.0005077024 -0.0051548740 -0.0028603585
## 4  -0.024182442 -0.0427127628  0.0212038159  0.0110237900 -0.0012328612
## 5   0.020355398 -0.0008293688 -0.0010942205  0.0028768728  0.0048495994
## 6  -0.033355870 -0.0329966426 -0.0221282768  0.0143774401  0.0005603990
## 7  -0.014124847  0.0144513606 -0.0097817597  0.0009869065  0.0075041605
## 8  -0.003831058 -0.0056278318 -0.0012771670  0.0004016718 -0.0007627094
## 9  -0.006863702  0.0426738753  0.0394846895  0.0216536041 -0.0034623673
## 10  0.005676438  0.0002761843  0.0020246969  0.0009825398 -0.0009557223
## 11 -0.012430614 -0.0018737929 -0.0015530616  0.0011662163  0.0007436669
## 12 -0.022542480 -0.0293402778 -0.0205176657  0.0007955571  0.0033599140
## 13 -0.004238412  0.0093505483 -0.0037253126  0.0056077852  0.0072692182
##             PC40          PC41          PC42          PC43          PC44
## 1   0.0027995035 -0.0025804007  0.0048109152  0.0019067998 -2.724551e-03
## 2  -0.0016406878 -0.0023061972  0.0066992635  0.0027350809  3.098881e-03
## 3  -0.0001642517 -0.0048118442 -0.0056442701 -0.0061145259  8.431813e-04
## 4  -0.0034592852  0.0095510327  0.0016660918  0.0024528068  2.041669e-03
## 5   0.0008327220  0.0003289098  0.0051658690  0.0005924205  2.859167e-05
## 6  -0.0058885419  0.0044003815 -0.0010219476 -0.0064551824  3.851411e-03
## 7   0.0005482449 -0.0001455944 -0.0065028808 -0.0080517266 -7.295247e-03
## 8   0.0003770352  0.0021081403  0.0010751785  0.0020354963 -6.328756e-04
## 9  -0.0085429206 -0.0009096319  0.0205239598 -0.0029709711  4.134153e-03
## 10 -0.0004060854 -0.0005120010  0.0001754743  0.0046099293 -2.204789e-03
## 11  0.0005109423 -0.0025078151  0.0021326524 -0.0008072663 -1.067690e-03
## 12 -0.0003787278 -0.0017265263  0.0060458352  0.0014307225  7.490251e-03
## 13  0.0018977171 -0.0042130539  0.0039727563 -0.0034226730  6.030265e-03
##             PC45          PC46          PC47         PC48 GENENAME
## 1   0.0101801552 -0.0041970778 -4.504517e-03 -0.010356680    BMP10
## 2  -0.0099542719 -0.0045540547  1.036295e-03  0.006684751    KCNJ2
## 3  -0.0007933573  0.0008043687 -3.493117e-03 -0.001027548    PITX2
## 4  -0.0056645798  0.0076371832 -9.791100e-04  0.004787730     NPPB
## 5  -0.0068101493  0.0053138297 -1.765072e-02  0.004397189     SCD5
## 6  -0.0041321545  0.0005382408  5.461900e-04 -0.008428587     NPPA
## 7  -0.0009958840 -0.0032948479  7.587697e-04 -0.003991923    KCNV2
## 8  -0.0015659203  0.0022462584 -3.595410e-03  0.003209435     IRX4
## 9  -0.0125983413 -0.0070993250  3.789036e-04  0.012431994     MYL7
## 10 -0.0006508715  0.0057216033  3.520730e-03  0.010179654     IRX3
## 11 -0.0023433898 -0.0067091561  1.290452e-05  0.004801349     IRX5
## 12 -0.0019563549 -0.0052078546  2.882416e-03  0.007880405    KCNJ5
## 13 -0.0014009894 -0.0114040492  2.110588e-03  0.006649243    KCNK3
pca_batchRemoved <- prcomp(t(logCPM_batchRemoved))
pca_scores <- as.data.frame(pca_batchRemoved$x)  # PCA scores
pca_scores$Sample <- rownames(pca_scores)  # Add sample names
pca_scores <- merge(pca_scores, meta, by.x = "Sample", by.y = "row.names")

dimensions <- list(c("PC1", "PC3"), c("PC1", "PC4"))
generate_and_save_pca_plots(
  pca_df = pca_scores,
  loadings_df = specific_loadings_df,
  dimensions = dimensions,
  colors = chamber_colors,
  shapes = condition_shapes,  # Include shapes for control/disease
  add_ellipse = FALSE,  # Optionally toggle to TRUE if ellipses are needed
  shape_var = "Group"  # Use Group as the shape variable
)

### Extract variance explained by each component
# Extract the standard deviation of each principal component
pca_sdev <- pca_batchRemoved$sdev

# Calculate the variance explained by each component
variance_explained <- (pca_sdev^2) / sum(pca_sdev^2)

# Convert to a data frame for easier viewing
variance_explained_df <- data.frame(
  Principal_Component = paste0("PC", 1:length(variance_explained)),
  Variance_Explained = variance_explained,
  Cumulative_Variance = cumsum(variance_explained)
)

# Print variance explained by each component
print(variance_explained_df)
##    Principal_Component Variance_Explained Cumulative_Variance
## 1                  PC1       3.480855e-01           0.3480855
## 2                  PC2       2.934088e-01           0.6414942
## 3                  PC3       4.140402e-02           0.6828982
## 4                  PC4       3.494612e-02           0.7178444
## 5                  PC5       2.789915e-02           0.7457435
## 6                  PC6       2.438526e-02           0.7701288
## 7                  PC7       2.076548e-02           0.7908943
## 8                  PC8       1.893908e-02           0.8098333
## 9                  PC9       1.549361e-02           0.8253269
## 10                PC10       1.438969e-02           0.8397166
## 11                PC11       1.392454e-02           0.8536412
## 12                PC12       1.033311e-02           0.8639743
## 13                PC13       9.470018e-03           0.8734443
## 14                PC14       8.604413e-03           0.8820487
## 15                PC15       7.956783e-03           0.8900055
## 16                PC16       7.552653e-03           0.8975581
## 17                PC17       7.105511e-03           0.9046637
## 18                PC18       6.700273e-03           0.9113639
## 19                PC19       6.488442e-03           0.9178524
## 20                PC20       5.906215e-03           0.9237586
## 21                PC21       5.822941e-03           0.9295815
## 22                PC22       5.438269e-03           0.9350198
## 23                PC23       5.286348e-03           0.9403061
## 24                PC24       5.042680e-03           0.9453488
## 25                PC25       4.986711e-03           0.9503355
## 26                PC26       4.737013e-03           0.9550725
## 27                PC27       4.689348e-03           0.9597619
## 28                PC28       4.598649e-03           0.9643605
## 29                PC29       4.423403e-03           0.9687839
## 30                PC30       4.283260e-03           0.9730672
## 31                PC31       4.153394e-03           0.9772206
## 32                PC32       4.056323e-03           0.9812769
## 33                PC33       3.922411e-03           0.9851993
## 34                PC34       3.861792e-03           0.9890611
## 35                PC35       3.757164e-03           0.9928183
## 36                PC36       3.652218e-03           0.9964705
## 37                PC37       3.529492e-03           1.0000000
## 38                PC38       1.991836e-30           1.0000000
## 39                PC39       7.776743e-31           1.0000000
## 40                PC40       5.789392e-31           1.0000000
## 41                PC41       4.999409e-31           1.0000000
## 42                PC42       2.148111e-31           1.0000000
## 43                PC43       1.925285e-31           1.0000000
## 44                PC44       1.196867e-31           1.0000000
## 45                PC45       1.145573e-31           1.0000000
## 46                PC46       7.838796e-32           1.0000000
## 47                PC47       5.926737e-32           1.0000000
## 48                PC48       5.909015e-32           1.0000000

3.4.3 Subset to only healthy horses for figure 1

# Subset the metadata to include only "Control" horses
healthy_meta <- meta[meta$Group == "Control", ]

# Subset the count matrix to include only columns (samples) of healthy horses
healthy_samples <- rownames(healthy_meta)  # Get sample names of healthy horses
logCPM_healthy <- logCPM[, healthy_samples]  # Subset the logCPM matrix

# Use removeBatchEffect with healthy data only
logCPM_healthy_batchRemoved <- removeBatchEffect(logCPM_healthy, 
                                                 batch = y$samples[healthy_samples, ]$Horse, 
                                                 design = design[healthy_samples, ])
## Coefficients not estimable: LA_AF LV_AF RA_AF RV_AF
## Warning: Partial NA coefficients for 14510 probe(s)
#Perform PCA on the healthy horses' logCPM values
pca_healthy <- prcomp(t(logCPM_healthy_batchRemoved))

# Extract PCA scores for healthy horses
pca_scores_healthy <- as.data.frame(pca_healthy$x)  # PCA scores for healthy horses
pca_scores_healthy$Sample <- rownames(pca_scores_healthy)  # Add sample names

# Merge with the healthy subset of metadata for plotting
pca_scores_healthy <- merge(pca_scores_healthy, healthy_meta, by.x = "Sample", by.y = "row.names")

# Define dimensions for PCA plotting 
dims <- list(p1 = c("PC1", "PC2"), p2 = c("PC1", "PC3"), p3 = c("PC2", "PC3"))
pca_plot_healthy <- list()

# Create PCA plots for healthy horses using predefined colors and shapes
for (i in seq_along(dims)){
  pca_plot_healthy[[i]] <- ggplot(pca_scores_healthy, aes_string(x = dims[[i]][1], y = dims[[i]][2], 
                                                                color = "Region", fill = "Region")) +
    geom_point(size = 4, alpha = 0.8, stroke = 1.5) +  # Adjusted to include border color
    scale_color_manual(values = chamber_colors) +      # Outer colors
    scale_fill_manual(values = chamber_fill_colors) +  # Fill colors
    scale_shape_manual(values = condition_shapes) +
    theme_minimal() +
    labs(title = paste("PCA (Healthy Horses):", dims[[i]][1], "vs", dims[[i]][2]), 
         x = dims[[i]][1], y = dims[[i]][2]) +
    theme(legend.position = "right")
}
patchwork::wrap_plots(pca_plot_healthy, ncol = 2) + patchwork::plot_layout(guides = 'collect')
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's shape values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's shape values.
## No shared levels found between `names(values)` of the manual scale and the
## data's shape values.

# With labels
for (i in seq_along(dims)){
  pca_plot_healthy[[i]] <- ggplot(pca_scores_healthy, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group", label = "Horse")) +
    geom_point(size = 4, alpha = 0.8) +
    geom_text(vjust = -0.5, size = 3) +  # Add labels for each sample
    scale_color_manual(values = chamber_colors) +
    scale_shape_manual(values = condition_shapes) +
    theme_minimal() +
    labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
    theme(legend.position = "right")
}

# Combine PCA plots for healthy horses using patchwork
patchwork::wrap_plots(pca_plot_healthy, ncol = 2) + patchwork::plot_layout(guides = 'collect')

3.4.3.1 Add Loadings

# PCA loadings highlight the genes contributing most to principal components, 
# helping identify key drivers of transcriptional variability; supplementing DGE results

# Source the helper functions
source("pca_helpers.R")

#Extract loadings for plot
# Unified function to extract specific loadings with gene names
extract_and_map_loadings <- function(pca_obj, gene_list, annot) {
  # Map gene symbols to ENSEMBL IDs and remove NAs
  ensembl_ids <- annot$ENSEMBL[match(gene_list, annot$GENENAME)]
  ensembl_ids <- ensembl_ids[!is.na(ensembl_ids)]

  # Extract loadings for the mapped ENSEMBL IDs present in PCA loadings
  valid_ids <- intersect(ensembl_ids, rownames(pca_obj$rotation))
  if (length(valid_ids) == 0) {
    warning("No matching ENSEMBL IDs found in PCA loadings.")
    return(NULL)
  }
  
  # Extract loadings and add gene names from annotation
  loadings_df <- as.data.frame(pca_obj$rotation[valid_ids, , drop = FALSE])
  loadings_df$ENSEMBL <- rownames(loadings_df)
  loadings_df <- merge(loadings_df, annot[, c("ENSEMBL", "GENENAME")], by = "ENSEMBL", all.x = TRUE)
  
  return(loadings_df)
}

# Example gene list for loadings extraction (modify as needed)
highlighted_genes <- c("BMP10", "NPPA", "NPPB", "KCNJ5", "KCNK3", "MYL7", "KCNV2", "PCDH7", "KIF18B")

# Extract loadings with unified function
specific_loadings_df <- extract_and_map_loadings(pca_healthy, highlighted_genes, annot_reordered)

# Check results
print(specific_loadings_df)
##              ENSEMBL         PC1         PC2          PC3          PC4
## 1 ENSECAG00000000599  0.04128203 -0.07121482  0.030304906 -0.032071230
## 2 ENSECAG00000007670 -0.04187792 -0.05344778 -0.182171372  0.006027121
## 3 ENSECAG00000014282 -0.04908616 -0.10659291  0.118463723 -0.002990045
## 4 ENSECAG00000014892 -0.10947283 -0.02092027  0.062474966  0.003249863
## 5 ENSECAG00000016602 -0.07236999  0.03013959  0.085731061 -0.050952951
## 6 ENSECAG00000018912 -0.07944678 -0.01610544  0.011473745 -0.012164607
## 7 ENSECAG00000023019 -0.03526124  0.08195650 -0.030860436 -0.030426407
## 8 ENSECAG00000025007 -0.10372285  0.02219651 -0.008121261  0.038637045
## 9 ENSECAG00000038775 -0.08268097  0.04205286  0.000986836 -0.013644879
##            PC5          PC6           PC7          PC8          PC9
## 1  0.005743619 -0.009509976  0.0004020223 -0.001940592  0.006663117
## 2 -0.126553308  0.041239220 -0.0610635273  0.067588478  0.009910526
## 3 -0.033062846 -0.035715429 -0.1216359789  0.055189123  0.049599924
## 4 -0.020083032 -0.037107553 -0.0955982451 -0.004075129  0.014493231
## 5 -0.056147648 -0.020596374 -0.0028262312  0.027970752  0.064543567
## 6 -0.013894399  0.019483412 -0.0068119499  0.021592442  0.014527742
## 7 -0.026246107  0.007321278  0.0335366847 -0.027098257 -0.023158184
## 8 -0.007536985  0.026516548 -0.0133554256  0.023942239 -0.036670701
## 9 -0.008717469  0.012687793  0.0082786036  0.006067485 -0.012061662
##           PC10         PC11          PC12          PC13          PC14
## 1 -0.014348359 -0.029720120 -0.0029925903  6.080895e-03 -0.0008997756
## 2 -0.029649721 -0.017658428 -0.0786358849  4.052689e-02  0.0196434957
## 3 -0.009567100 -0.069878973 -0.0230841113 -4.260866e-05 -0.0087791247
## 4  0.002631524  0.001711544 -0.0096136133 -1.676756e-02 -0.0038826057
## 5  0.036220864 -0.005228765 -0.0258024831  3.651355e-02  0.0061294993
## 6  0.043432730 -0.007032016  0.0054679749 -1.133375e-02  0.0234860617
## 7 -0.007601022  0.008123758 -0.0050531598 -6.023305e-04  0.0049842649
## 8  0.006685247  0.023159496 -0.0046750647  2.163485e-02  0.0329738243
## 9 -0.001199439  0.015060327 -0.0001605864  1.585224e-02  0.0127654678
##            PC15         PC16          PC17         PC18         PC19
## 1  0.0012916731 -0.018634127  0.0002603693  0.001980064 -0.002660186
## 2  0.0435021885  0.022570455 -0.0185994510 -0.000264414  0.004494277
## 3  0.0017600062 -0.008754142  0.0338874444 -0.034024806 -0.017966932
## 4  0.0016642738 -0.026328777  0.0120437145 -0.040533169 -0.011141913
## 5  0.0269196032 -0.006978850 -0.0021779620 -0.002438107 -0.006965289
## 6 -0.0161986990 -0.009510093  0.0224553375  0.039771249 -0.006066325
## 7 -0.0049242688 -0.002195093  0.0259747386 -0.024210392  0.003487952
## 8  0.0008313835 -0.017779019  0.0025486425 -0.026666551 -0.004019149
## 9  0.0020570195 -0.004555134  0.0074710045  0.009732350 -0.005855678
##            PC20          PC21          PC22         PC23         PC24 GENENAME
## 1  0.0012394673 -0.0092387190 -0.0070551437 -0.000630666  0.002514286    PCDH7
## 2  0.0100987242 -0.0155825346 -0.0028594309 -0.020072186 -0.001655615    BMP10
## 3 -0.0199487981 -0.0167329406 -0.0060256662  0.005186767 -0.011830997     NPPB
## 4  0.0020671822 -0.0003647640 -0.0115427127  0.003982254  0.002701151     NPPA
## 5  0.0005184457 -0.0074043455 -0.0049424451 -0.005484195 -0.002987105    KCNV2
## 6 -0.0074759431  0.0008165151 -0.0162637746  0.017510515 -0.017375019     MYL7
## 7 -0.0078558964 -0.0058403386  0.0005389701 -0.006758917 -0.005072034   KIF18B
## 8  0.0001209625 -0.0034348392 -0.0029844971 -0.011504834  0.005912755    KCNJ5
## 9  0.0021067526 -0.0019574276 -0.0020470672 -0.001530989  0.002629899    KCNK3
# Example dimensions for PCA plotting
dimensions <- list(c("PC1", "PC2"), c("PC1", "PC3"), c("PC2", "PC3"))

# Generate and display the PCA plots with probability ellipses
generate_and_save_pca_plots(
  pca_df = pca_scores_healthy,
  loadings_df = specific_loadings_df,
  dimensions = dimensions,
  colors = chamber_colors,
  add_ellipse = FALSE  # Toggle to TRUE to include ellipses
)

### Extract variance explained by each component
# Extract the standard deviation of each principal component
pca_sdev <- pca_healthy$sdev

# Calculate the variance explained by each component
variance_explained <- (pca_sdev^2) / sum(pca_sdev^2)

# Convert to a data frame for easier viewing
variance_explained_df <- data.frame(
  Principal_Component = paste0("PC", 1:length(variance_explained)),
  Variance_Explained = variance_explained,
  Cumulative_Variance = cumsum(variance_explained)
)

# Print variance explained by each component
print(variance_explained_df)
##    Principal_Component Variance_Explained Cumulative_Variance
## 1                  PC1       5.697720e-01           0.5697720
## 2                  PC2       7.518616e-02           0.6449582
## 3                  PC3       6.488271e-02           0.7098409
## 4                  PC4       3.657129e-02           0.7464122
## 5                  PC5       3.515831e-02           0.7815705
## 6                  PC6       2.627400e-02           0.8078445
## 7                  PC7       2.211154e-02           0.8299560
## 8                  PC8       2.115299e-02           0.8511090
## 9                  PC9       1.938149e-02           0.8704905
## 10                PC10       1.756410e-02           0.8880546
## 11                PC11       1.653983e-02           0.9045944
## 12                PC12       1.603807e-02           0.9206325
## 13                PC13       1.497715e-02           0.9356097
## 14                PC14       1.429256e-02           0.9499022
## 15                PC15       1.346297e-02           0.9633652
## 16                PC16       1.249215e-02           0.9758573
## 17                PC17       1.222929e-02           0.9880866
## 18                PC18       1.191337e-02           1.0000000
## 19                PC19       1.227215e-30           1.0000000
## 20                PC20       5.261321e-31           1.0000000
## 21                PC21       3.357265e-31           1.0000000
## 22                PC22       3.294082e-31           1.0000000
## 23                PC23       2.519813e-31           1.0000000
## 24                PC24       1.340784e-31           1.0000000

3.5 Differential expression analysis

3.5.1 Create contrast for multilevel design

# create contrasts to test
colnames(design)
## [1] "LA_AF"      "LA_Control" "LV_AF"      "LV_Control" "RA_AF"     
## [6] "RA_Control" "RV_AF"      "RV_Control"
con <- makeContrasts(AF_vs_control_RA = RA_AF - RA_Control,
                     AF_vs_control_LA = LA_AF - LA_Control,
                     RA_vs_LA_control = RA_Control - LA_Control,
                     RA_vs_LA_AF = RA_AF - LA_AF,
                     InteractionEffect_Atrial = (RA_AF - RA_Control) - (LA_AF - LA_Control),
                     Avr.Dis.Effect_Atrial = (LA_AF + RA_AF)/2 - (LA_Control + RA_Control)/2,
                     AverageRegionEffect_Atrial = (RA_Control + RA_AF)/2 - (LA_Control + LA_AF)/2,
                     AF_vs_control_RV = RV_AF - RV_Control,
                     AF_vs_control_LV = LV_AF - LV_Control,
                     RV_vs_LV_control = RV_Control - LV_Control,
                     RV_vs_LV_AF = RV_AF - LV_AF,
                     InteractionEffect_Ventricular = (RV_AF - RV_Control) - (LV_AF - LV_Control),
                     Avr.Dis.Effect_Ventricular = (LV_AF + RV_AF)/2 - (LV_Control + RV_Control)/2,
                     AverageRegionEffect_Ventricular = (RV_Control + RV_AF)/2 - (LV_Control + LV_AF)/2,
                     AverageRegionEffect_AV_healthy = (RA_Control + LA_Control)/2 - (RV_Control + LV_Control)/2,
                     levels = design)
con
##             Contrasts
## Levels       AF_vs_control_RA AF_vs_control_LA RA_vs_LA_control RA_vs_LA_AF
##   LA_AF                     0                1                0          -1
##   LA_Control                0               -1               -1           0
##   LV_AF                     0                0                0           0
##   LV_Control                0                0                0           0
##   RA_AF                     1                0                0           1
##   RA_Control               -1                0                1           0
##   RV_AF                     0                0                0           0
##   RV_Control                0                0                0           0
##             Contrasts
## Levels       InteractionEffect_Atrial Avr.Dis.Effect_Atrial
##   LA_AF                            -1                   0.5
##   LA_Control                        1                  -0.5
##   LV_AF                             0                   0.0
##   LV_Control                        0                   0.0
##   RA_AF                             1                   0.5
##   RA_Control                       -1                  -0.5
##   RV_AF                             0                   0.0
##   RV_Control                        0                   0.0
##             Contrasts
## Levels       AverageRegionEffect_Atrial AF_vs_control_RV AF_vs_control_LV
##   LA_AF                            -0.5                0                0
##   LA_Control                       -0.5                0                0
##   LV_AF                             0.0                0                1
##   LV_Control                        0.0                0               -1
##   RA_AF                             0.5                0                0
##   RA_Control                        0.5                0                0
##   RV_AF                             0.0                1                0
##   RV_Control                        0.0               -1                0
##             Contrasts
## Levels       RV_vs_LV_control RV_vs_LV_AF InteractionEffect_Ventricular
##   LA_AF                     0           0                             0
##   LA_Control                0           0                             0
##   LV_AF                     0          -1                            -1
##   LV_Control               -1           0                             1
##   RA_AF                     0           0                             0
##   RA_Control                0           0                             0
##   RV_AF                     0           1                             1
##   RV_Control                1           0                            -1
##             Contrasts
## Levels       Avr.Dis.Effect_Ventricular AverageRegionEffect_Ventricular
##   LA_AF                             0.0                             0.0
##   LA_Control                        0.0                             0.0
##   LV_AF                             0.5                            -0.5
##   LV_Control                       -0.5                            -0.5
##   RA_AF                             0.0                             0.0
##   RA_Control                        0.0                             0.0
##   RV_AF                             0.5                             0.5
##   RV_Control                       -0.5                             0.5
##             Contrasts
## Levels       AverageRegionEffect_AV_healthy
##   LA_AF                                 0.0
##   LA_Control                            0.5
##   LV_AF                                 0.0
##   LV_Control                           -0.5
##   RA_AF                                 0.0
##   RA_Control                            0.5
##   RV_AF                                 0.0
##   RV_Control                           -0.5

3.5.2 Differential analysis - voomLmFit

# V is the result of my linear model fitted using voom transformation
y_raw <- d[keep, ,keep.lib.sizes=FALSE]

# We created the y_raw, but we will use our TMM-normalised data instead = run y instead

v <- voomLmFit(counts = y, 
               design = design, 
               block = as.factor(y$samples$Horse), 
               sample.weights = TRUE, 
               plot = TRUE) 
## First sample weights (min/max) 0.2817808/1.6829927
## First intra-block correlation  0.2269787

## Final sample weights (min/max) 0.2815015/1.6953994
## Final intra-block correlation  0.2269201
res <- list() # list for DGE results
for (i in colnames(con)) {
  fit <- contrasts.fit(v, contrasts = con)
  fit <- eBayes(fit, robust = TRUE)
  res[[i]] <- topTable(fit, coef = i, number = Inf)
  res[[i]] <- data.frame(res[[i]], Contrast = i)
  n <- res[[i]] %>% dplyr::filter(adj.P.Val < 0.05) %>% nrow 
  print(paste('number of DE genes for',i, '=', n))
}
## [1] "number of DE genes for AF_vs_control_RA = 1330"
## [1] "number of DE genes for AF_vs_control_LA = 714"
## [1] "number of DE genes for RA_vs_LA_control = 520"
## [1] "number of DE genes for RA_vs_LA_AF = 169"
## [1] "number of DE genes for InteractionEffect_Atrial = 0"
## [1] "number of DE genes for Avr.Dis.Effect_Atrial = 2076"
## [1] "number of DE genes for AverageRegionEffect_Atrial = 847"
## [1] "number of DE genes for AF_vs_control_RV = 268"
## [1] "number of DE genes for AF_vs_control_LV = 60"
## [1] "number of DE genes for RV_vs_LV_control = 1299"
## [1] "number of DE genes for RV_vs_LV_AF = 0"
## [1] "number of DE genes for InteractionEffect_Ventricular = 109"
## [1] "number of DE genes for Avr.Dis.Effect_Ventricular = 207"
## [1] "number of DE genes for AverageRegionEffect_Ventricular = 591"
## [1] "number of DE genes for AverageRegionEffect_AV_healthy = 7889"
res_all <- do.call(rbind, res)

# Create output Excel file
res_cleaned <- lapply(res, function(df) {
  names(df)[names(df) == "GeneName"] <- "Ensemblname"  # or remove it if redundant
  df
})
openxlsx::write.xlsx(x = res_cleaned, file = "../output/dge_results.xlsx", asTable = TRUE)


# Create output TSV file
data.table::fwrite(x = res_all, file = "../output/dge_results.tsv.gz", sep = "\t")

3.5.3 Diagnostics for differential analysis

3.5.3.1 p-value histograms

ggplot(res_all, aes(x = P.Value)) + 
  geom_histogram(fill = "lightgray",
                 color = "black",
                 breaks = seq(0, 1, by = 0.05),
                 closed = "right",
                 lwd = 0.2) + 
  facet_wrap(~ Contrast, nrow = 3, scales = "free") + 
  theme_bw()

3.5.4 Volcano plots

volcano_plots <- list()
for (i in names(res)){
  volcano_plots[[i]] <- ggVolcano(x = res[[i]], 
                                  fdr = 0.05,
                                  fdr.column = "adj.P.Val", 
                                  pvalue.column = "P.Value", 
                                  logFC = 0, 
                                  logFC.column = "logFC", 
                                  text.size = 2) + 
    theme_bw(base_size = 10) + 
    ggtitle(i)
}
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
# Combine all volcano plots into a single layout with 3 columns.
patchwork::wrap_plots(volcano_plots, ncol = 3)

# Print individual volcano plots for key contrasts.
print(volcano_plots[["RA_vs_LA_control"]])

print(volcano_plots[["RV_vs_LV_control"]])

print(volcano_plots[["AverageRegionEffect_AV_healthy"]])

# Create a combined plot of the key contrasts for easier comparison.
combined_plot <- ((volcano_plots[["RA_vs_LA_control"]]) | (volcano_plots[["AverageRegionEffect_AV_healthy"]]))
print(combined_plot)

### Volcano Plot Custom

regions <- unique(healthy_meta$Region)

# Gene Name Mapping for Volcano Plots in RNA-seq Analysis
# Check if the annotation dataframe has the necessary columns
if (!"ENSEMBL" %in% colnames(annot_reordered) || !"GENENAME" %in% colnames(annot_reordered)) {
  stop("Error: The annotation dataframe must contain both 'ENSEMBL' and 'GENENAME' columns.")
}
# Create a named vector for ENSEMBL to GENENAME mapping
ensembl_to_genename <- setNames(annot_reordered$GENENAME, annot_reordered$ENSEMBL)
# Map ENSEMBL IDs to GeneNames in the res list
for (contrast_name in names(res)) {
  # Ensure the dataframe has ENSEMBL IDs as rownames
  if (!"ENSEMBL" %in% colnames(res[[contrast_name]])) {
    res[[contrast_name]]$ENSEMBL <- rownames(res[[contrast_name]])
  }
  
  # Map GENENAMEs to the dataframe using ENSEMBL IDs
  res[[contrast_name]]$GENENAME <- ensembl_to_genename[res[[contrast_name]]$ENSEMBL]
  
  # Replace NA values in GENENAME with ENSEMBL IDs (to ensure plotting works even if some gene names are missing)
  res[[contrast_name]]$GENENAME[is.na(res[[contrast_name]]$GENENAME)] <- res[[contrast_name]]$ENSEMBL[is.na(res[[contrast_name]]$GENENAME)]
}


# Source the helper function
source("volcano_helpers.R")

# Create lists to store both versions of volcano plots
volcano_plots_no_labels <- list()
volcano_plots_with_labels <- list()

# Iterate over each contrast in `res` and create custom volcano plots with/without labels
for (contrast_name in names(res)) {
  # Ensure the GeneName column is present in the dataframe for labeling
  if (!"GeneName" %in% colnames(res[[contrast_name]])) {
    # Map ENSEMBL to GeneName using the preprocessed mapping vector
    res[[contrast_name]]$GeneName <- sapply(rownames(res[[contrast_name]]), function(x) gsub(".*_", "", x))
  }
  
  # Generate volcano plots with and without labels using the helper function
  volcano_plots <- create_custom_volcano_plot(
    df = res[[contrast_name]],
    logFC_col = "logFC",
    pvalue_col = "P.Value",
    adj_pvalue_col = "adj.P.Val",
    contrast_name = contrast_name,
    fc_cutoff = 1,
    pvalue_cutoff = 0.05,
    save_plot = TRUE, 
    output_path = "../output/",
    show_labels = TRUE  # Always generate both labeled and unlabeled versions
  )
  
  # Store the plots in separate lists
  volcano_plots_no_labels[[contrast_name]] <- volcano_plots$No_Labels
  volcano_plots_with_labels[[contrast_name]] <- volcano_plots$With_Labels
}
## Warning: ggrepel: 1149 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 624 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 407 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 120 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1806 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 699 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 190 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1082 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 156 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 495 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 6684 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Combine and display the volcano plots without labels
patchwork::wrap_plots(volcano_plots_no_labels, ncol = 3)

# Print individual volcano plots with labels for key contrasts
print(volcano_plots_with_labels[["RA_vs_LA_control"]])
## Warning: ggrepel: 380 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

print(volcano_plots_no_labels[["RA_vs_LA_control"]])

print(volcano_plots_with_labels[["RV_vs_LV_control"]])
## Warning: ggrepel: 1021 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

print(volcano_plots_with_labels[["AverageRegionEffect_AV_healthy"]])
## Warning: ggrepel: 6607 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

print(volcano_plots_with_labels[["AF_vs_control_LA"]])
## Warning: ggrepel: 551 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

print(volcano_plots_with_labels[["AF_vs_control_RA"]])
## Warning: ggrepel: 1091 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

### Volcano Plot Custom (Revision)

regions <- unique(healthy_meta$Region)

# Gene Name Mapping for Volcano Plots in RNA-seq Analysis
# Check if the annotation dataframe has the necessary columns
if (!"ENSEMBL" %in% colnames(annot_reordered) || !"GENENAME" %in% colnames(annot_reordered)) {
  stop("Error: The annotation dataframe must contain both 'ENSEMBL' and 'GENENAME' columns.")
}
# Create a named vector for ENSEMBL to GENENAME mapping
ensembl_to_genename <- setNames(annot_reordered$GENENAME, annot_reordered$ENSEMBL)
# Map ENSEMBL IDs to GeneNames in the res list
for (contrast_name in names(res)) {
  # Ensure the dataframe has ENSEMBL IDs as rownames
  if (!"ENSEMBL" %in% colnames(res[[contrast_name]])) {
    res[[contrast_name]]$ENSEMBL <- rownames(res[[contrast_name]])
  }
  
  # Map GENENAMEs to the dataframe using ENSEMBL IDs
  res[[contrast_name]]$GENENAME <- ensembl_to_genename[res[[contrast_name]]$ENSEMBL]
  
  # Replace NA values in GENENAME with ENSEMBL IDs (to ensure plotting works even if some gene names are missing)
  res[[contrast_name]]$GENENAME[is.na(res[[contrast_name]]$GENENAME)] <- res[[contrast_name]]$ENSEMBL[is.na(res[[contrast_name]]$GENENAME)]
}


# Source the helper function
source("volcano_helpers.R")

# Create lists to store both versions of volcano plots
volcano_plots_no_labels <- list()
volcano_plots_with_labels <- list()

# Iterate over each contrast in `res` and create custom volcano plots with/without labels
for (contrast_name in names(res)) {
  # Ensure the GeneName column is present in the dataframe for labeling
  if (!"GeneName" %in% colnames(res[[contrast_name]])) {
    # Map ENSEMBL to GeneName using the preprocessed mapping vector
    res[[contrast_name]]$GeneName <- sapply(rownames(res[[contrast_name]]), function(x) gsub(".*_", "", x))
  }
  
  # Generate volcano plots with and without labels using the helper function
  volcano_plots <- create_custom_volcano_plot(
    df = res[[contrast_name]],
    logFC_col = "logFC",
    pvalue_col = "P.Value",
    adj_pvalue_col = "adj.P.Val",
    contrast_name = contrast_name,
    fc_cutoff = 0,
    pvalue_cutoff = 0.05,
    save_plot = TRUE, 
    output_path = "../output/",
    show_labels = TRUE  # Always generate both labeled and unlabeled versions
  )
  
  # Store the plots in separate lists
  volcano_plots_no_labels[[contrast_name]] <- volcano_plots$No_Labels
  volcano_plots_with_labels[[contrast_name]] <- volcano_plots$With_Labels
}
## Warning: ggrepel: 1149 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 624 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 407 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 120 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1806 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 699 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 190 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1082 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 156 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 495 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 6684 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Combine and display the volcano plots without labels
patchwork::wrap_plots(volcano_plots_no_labels, ncol = 3)

# Print individual volcano plots with labels for key contrasts
print(volcano_plots_with_labels[["RA_vs_LA_control"]])
## Warning: ggrepel: 380 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

print(volcano_plots_no_labels[["RA_vs_LA_control"]])

print(volcano_plots_with_labels[["RV_vs_LV_control"]])
## Warning: ggrepel: 1021 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

print(volcano_plots_with_labels[["AverageRegionEffect_AV_healthy"]])
## Warning: ggrepel: 6607 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

print(volcano_plots_with_labels[["AF_vs_control_LA"]])
## Warning: ggrepel: 551 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

print(volcano_plots_with_labels[["AF_vs_control_RA"]])
## Warning: ggrepel: 1091 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

4 Region Specific Genes (From Bgee Pibeline)

This script identifies genes detected specifically or exclusively in different heart regions (left/right atria and ventricles) using the Bgee pipeline prescense/absence call.

# This analysis identifies genes detected specifically or exclusively in different heart regions
# (left/right atria and ventricles) using the Bgee pipeline presence/absence calls.

# Step 1: Read data based on bgee-call workflow performed on uCloud directly on raw-libraries
bgee <- fread("../../../data/count/merged_calls_bgee.tsv")
## Warning in fread("../../../data/count/merged_calls_bgee.tsv"): Detected 48
## column names but the data has 49 columns (i.e. invalid file). Added 1 extra
## default column name for the first column which is guessed to be row names or an
## index. Use setnames() afterwards if this guess is not correct, or fix the file
## write command that created the file to create a valid file.
bgee <- bgee %>%
    rename(GeneID = V1)

# Step 2: Filter metadata for healthy samples
meta_reordered <- meta_reordered %>%
  filter(Group == "Control")  # Only healthy samples
meta_reordered <- meta_reordered %>%
  mutate(Sample = rownames(meta_reordered))  # Add a 'Sample' column for mapping


# Step 3: Subset bgee to include only healthy samples
healthy_samples <- rownames(meta_reordered)  # Row names in meta_reordered are the sample names
healthy_columns <- c("GeneID", healthy_samples)  # Keep GeneID and healthy sample columns
bgee_healthy <- bgee %>%
  select(all_of(healthy_columns))  # Subset bgee using healthy sample columns

# Step 4: Reshape bgee_healthy into long format and map regions
bgee_long <- bgee_healthy %>%
  pivot_longer(-GeneID, names_to = "Sample", values_to = "Presence") %>%
  left_join(meta_reordered %>% select(Sample, Region), by = "Sample") %>%  # Map regions
  mutate(Presence = as.numeric(Presence == "present"))  # Convert "present"/"absent" to binary

# Step 5: Calculate the percentage of "present" samples per region
bgee_summary <- bgee_long %>%
  group_by(GeneID, Region) %>%
  summarize(PercentPresent = mean(Presence), .groups = "drop")  # Proportion of "present" samples per region

# Step 6: Apply the 75% threshold for detection in a region
detection_threshold <- 0.75
binary_matrix <- bgee_summary %>%
  mutate(IsPresent = PercentPresent >= detection_threshold) %>%  # Binary presence/absence
  pivot_wider(names_from = Region, values_from = IsPresent, values_fill = FALSE) %>%  # Create binary matrix
  filter(rowSums(select(., -GeneID)) > 0) %>%  # Retain rows where at least one region is TRUE
  distinct(GeneID, .keep_all = TRUE) %>%  # Remove duplicates
  mutate(across(-GeneID, as.numeric))  # Convert logical values to numeric
binary_matrix <- binary_matrix %>%
  select(-PercentPresent)  # Remove the PercentPresent column


# Step 7: Annotate binary_matrix
annot_reordered <- annot_reordered %>%
  distinct(ENSEMBL, .keep_all = TRUE)  # Keep only unique rows by ENSEMBL

binary_matrix_annot <- binary_matrix %>%
  left_join(annot_reordered[, c("ENSEMBL", "GENENAME")], by = c("GeneID" = "ENSEMBL")) %>%
  mutate(GENENAME = if_else(is.na(GENENAME), GeneID, GENENAME)) %>%
  distinct(GeneID, .keep_all = TRUE)  # Ensure no duplicates after annotation

# Step 8: Update binary matrix for strict inclusion and exclusion
binary_matrix_strict <- binary_matrix_annot %>%
  mutate(
    Atrial_Specific = if_all(all_of(c("LA", "RA")), ~ . == 1) & if_all(all_of(c("LV", "RV")), ~ . == 0),
    Ventricular_Specific = if_all(all_of(c("LV", "RV")), ~ . == 1) & if_all(all_of(c("LA", "RA")), ~ . == 0),
    LA_Only = (LA == 1) & if_all(all_of(c("RA", "LV", "RV")), ~ . == 0),
    RA_Only = (RA == 1) & if_all(all_of(c("LA", "LV", "RV")), ~ . == 0),
    LV_Only = (LV == 1) & if_all(all_of(c("LA", "RA", "RV")), ~ . == 0),
    RV_Only = (RV == 1) & if_all(all_of(c("LA", "RA", "LV")), ~ . == 0),
    Left_Side_Specific = if_all(all_of(c("LA", "LV")), ~ . == 1) & if_all(all_of(c("RA", "RV")), ~ . == 0),
    Right_Side_Specific = if_all(all_of(c("RA", "RV")), ~ . == 1) & if_all(all_of(c("LA", "LV")), ~ . == 0),
    All_Four_Regions = if_all(all_of(c("LA", "RA", "LV", "RV")), ~ . == 1)
  ) %>%
  select(GeneID, Atrial_Specific, Ventricular_Specific, LA_Only, RA_Only, LV_Only, RV_Only, Left_Side_Specific, Right_Side_Specific, All_Four_Regions) %>%
  filter(rowSums(select(., -GeneID)) > 0)  # Retain genes that meet criteria in at least one category

# Generate the UpSet plot
upset_plot <- ComplexUpset::upset(
  binary_matrix_strict,
  colnames(binary_matrix_strict)[-1],  # Use the updated region-specific categories
  name = "Region-Specific Genes",
  intersections = "all",  # Show all intersections
  min_size = 5,  # Minimum number of genes per intersection
  width_ratio = 0.2,
  height_ratio = 0.4,
  themes = upset_default_themes(panel.grid.major = element_blank())
)

# Customize and display the plot
upset_plot <- upset_plot +
  ggtitle("Region-Specific Genes in Healthy Horse Hearts (Strict Logic)") +
  theme(plot.title = element_text(hjust = 0.5, size = 16, face = "bold"))
print(upset_plot)

# Save the plot
ggsave(
  filename = "../output/Detected_Genes_Intersections_UpSet_BGEE.png",
  plot = upset_plot,
  width = 6, height = 4, dpi = 600
)

# Step 9: Extract genes for each intersection
gene_lists <- list()

# Define updated intersection logic
intersections <- list(
  "LA_Only" = list(include = c("LA"), exclude = c("RA", "LV", "RV")),
  "RA_Only" = list(include = c("RA"), exclude = c("LA", "LV", "RV")),
  "LV_Only" = list(include = c("LV"), exclude = c("LA", "RA", "RV")),
  "RV_Only" = list(include = c("RV"), exclude = c("LA", "RA", "LV")),
  "Atrial_Specific" = list(include = c("LA", "RA"), exclude = c("LV", "RV")),
  "Ventricular_Specific" = list(include = c("LV", "RV"), exclude = c("LA", "RA")),
  "Left_Side_Specific" = list(include = c("LA", "LV"), exclude = c("RA", "RV")),
  "Right_Side_Specific" = list(include = c("RA", "RV"), exclude = c("LA", "LV")),
  "All_Four_Regions" = list(include = c("LA", "RA", "LV", "RV"), exclude = c())
)

# Extract genes for each intersection
gene_lists <- list()

for (name in names(intersections)) {
  include_regions <- intersections[[name]]$include
  exclude_regions <- intersections[[name]]$exclude
  
  # Enforce strict inclusion and exclusion criteria
  genes_in_intersection <- binary_matrix_annot %>%
    filter(if_all(all_of(include_regions), ~ . == 1)) %>%  # Present in all included regions
    filter(if_all(all_of(exclude_regions), ~ . == 0)) %>%  # Absent in all excluded regions
    select(GeneID, GENENAME)
  
  # Store the results
  gene_lists[[name]] <- genes_in_intersection
}

# Step 10: Save gene lists to Excel
output_file <- "../output/Detected_Genes_Intersections_Bgee.xlsx"
wb <- createWorkbook()

for (sheet_name in names(gene_lists)) {
  addWorksheet(wb, sheet_name)
  writeData(wb, sheet = sheet_name, x = gene_lists[[sheet_name]])
}

saveWorkbook(wb, output_file, overwrite = TRUE)
cat("Intersection lists successfully saved to:", output_file, "\n")
## Intersection lists successfully saved to: ../output/Detected_Genes_Intersections_Bgee.xlsx

5 Session Info

sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Danish_Denmark.utf8  LC_CTYPE=Danish_Denmark.utf8   
## [3] LC_MONETARY=Danish_Denmark.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=Danish_Denmark.utf8    
## 
## time zone: Europe/Copenhagen
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_1.3.1        ComplexUpset_1.3.3 ggrepel_0.9.6      RColorBrewer_1.1-3
##  [5] dplyr_1.1.4        devtools_2.4.5     usethis_3.0.0      openxlsx_4.2.7.1  
##  [9] patchwork_1.3.0    data.table_1.16.2  ggplot2_3.5.1      stringr_1.5.1     
## [13] tibble_3.2.1       magrittr_2.0.3     biomaRt_2.62.0     readxl_1.4.3      
## [17] readr_2.1.5        edgeR_4.4.0        limma_3.62.1       aamisc_0.1.5      
## [21] pacman_0.5.1      
## 
## loaded via a namespace (and not attached):
##   [1] rstudioapi_0.17.1       jsonlite_1.8.9          farver_2.1.2           
##   [4] rmarkdown_2.29          ragg_1.3.3              fs_1.6.5               
##   [7] zlibbioc_1.52.0         vctrs_0.6.5             multtest_2.62.0        
##  [10] memoise_2.0.1           htmltools_0.5.8.1       progress_1.2.3         
##  [13] curl_6.0.1              cellranger_1.1.0        sass_0.4.9             
##  [16] bslib_0.8.0             htmlwidgets_1.6.4       plyr_1.8.9             
##  [19] httr2_1.0.6             cachem_1.1.0            mime_0.12              
##  [22] lifecycle_1.0.4         pkgconfig_2.0.3         Matrix_1.7-1           
##  [25] R6_2.5.1                fastmap_1.2.0           GenomeInfoDbData_1.2.13
##  [28] shiny_1.9.1             digest_0.6.37           HarmonicRegression_1.0 
##  [31] colorspace_2.1-1        AnnotationDbi_1.68.0    S4Vectors_0.44.0       
##  [34] pkgload_1.4.0           textshaping_0.4.0       RSQLite_2.3.8          
##  [37] labeling_0.4.3          filelock_1.0.3          fansi_1.0.6            
##  [40] httr_1.4.7              compiler_4.4.2          remotes_2.5.0          
##  [43] bit64_4.5.2             withr_3.0.2             rain_1.40.0            
##  [46] DBI_1.2.3               pkgbuild_1.4.5          R.utils_2.12.3         
##  [49] MASS_7.3-61             rappdirs_0.3.3          sessioninfo_1.2.2      
##  [52] tools_4.4.2             NISTunits_1.0.1         zip_2.3.1              
##  [55] httpuv_1.6.15           R.oo_1.27.0             glue_1.8.0             
##  [58] promises_1.3.0          grid_4.4.2              reshape2_1.4.4         
##  [61] generics_0.1.3          gtable_0.3.6            tzdb_0.4.0             
##  [64] R.methodsS3_1.8.2       hms_1.1.3               xml2_1.3.6             
##  [67] utf8_1.2.4              XVector_0.46.0          BiocGenerics_0.52.0    
##  [70] pillar_1.9.0            vroom_1.6.5             later_1.3.2            
##  [73] splines_4.4.2           BiocFileCache_2.14.0    lattice_0.22-6         
##  [76] survival_3.7-0          gmp_0.7-5               bit_4.5.0              
##  [79] tidyselect_1.2.1        locfit_1.5-9.10         Biostrings_2.74.0      
##  [82] miniUI_0.1.1.1          knitr_1.49              IRanges_2.40.0         
##  [85] stats4_4.4.2            xfun_0.49               Biobase_2.66.0         
##  [88] statmod_1.5.0           stringi_1.8.4           UCSC.utils_1.2.0       
##  [91] yaml_2.3.10             evaluate_1.0.1          qvalue_2.38.0          
##  [94] cli_3.6.3               systemfonts_1.1.0       xtable_1.8-4           
##  [97] munsell_0.5.1           jquerylib_0.1.4         Rcpp_1.0.13-1          
## [100] GenomeInfoDb_1.42.0     dbplyr_2.5.0            png_0.1-8              
## [103] parallel_4.4.2          ellipsis_0.3.2          blob_1.2.4             
## [106] prettyunits_1.2.0       profvis_0.4.0           urlchecker_1.0.1       
## [109] scales_1.3.0            purrr_1.0.2             crayon_1.5.3           
## [112] rlang_1.1.4             KEGGREST_1.46.0